#include <iostream>
#include <chrono>
#include <cmath>
#include "cuda_utils.h"
#include "logging.h"
#include "common.hpp"
#include "utils.h"
#include "calibrator.h"

// Converts a box expressed in network-input coordinates into a cv::Rect in
// the coordinate frame of `img`.
//
// bbox is read as {center_x, center_y, width, height}. The smaller of the two
// input/image ratios is used as the scale, and half of the unused input extent
// along the other axis is subtracted first (presumably undoing a centered
// letterbox resize -- confirm against the preprocessing code).
// Intermediate edges are truncated to int before and after rescaling.
cv::Rect YOLOV5Net::get_rect(cv::Mat& img, float bbox[4]) {
	const float ratio_w = Yolo::INPUT_W / (img.cols * 1.0);
	const float ratio_h = Yolo::INPUT_H / (img.rows * 1.0);
	const bool width_bound = ratio_h > ratio_w;
	const float ratio = width_bound ? ratio_w : ratio_h;

	int left, right, top, bottom;
	if (width_bound) {
		// Width fills the input; the offset applies to the vertical axis only.
		const auto offset_y = (Yolo::INPUT_H - ratio * img.rows) / 2;
		left = bbox[0] - bbox[2] / 2.f;
		right = bbox[0] + bbox[2] / 2.f;
		top = bbox[1] - bbox[3] / 2.f - offset_y;
		bottom = bbox[1] + bbox[3] / 2.f - offset_y;
	}
	else {
		// Height fills the input; the offset applies to the horizontal axis only.
		const auto offset_x = (Yolo::INPUT_W - ratio * img.cols) / 2;
		left = bbox[0] - bbox[2] / 2.f - offset_x;
		right = bbox[0] + bbox[2] / 2.f - offset_x;
		top = bbox[1] - bbox[3] / 2.f;
		bottom = bbox[1] + bbox[3] / 2.f;
	}

	// Scale the (already truncated) edges back to image pixels.
	left = left / ratio;
	right = right / ratio;
	top = top / ratio;
	bottom = bottom / ratio;
	return cv::Rect(left, top, right - left, bottom - top);
}

// Intersection-over-union of two boxes given as {center_x, center_y, width, height}.
//
// Returns 0 when the boxes do not overlap. Also returns 0 when the union area
// is not strictly positive (e.g. two zero-sized boxes), which would otherwise
// produce a 0/0 division and a NaN result.
float YOLOV5Net::iou(float lbox[4], float rbox[4]) {
	// Edges of the intersection rectangle.
	const float left = (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
	const float right = (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
	const float top = (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
	const float bottom = (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

	if (top > bottom || left > right)
		return 0.0f;

	const float interArea = (right - left) * (bottom - top);
	const float unionArea = lbox[2] * lbox[3] + rbox[2] * rbox[3] - interArea;
	// Negated comparison so that a NaN union is rejected as well.
	if (!(unionArea > 0.0f))
		return 0.0f;

	return interArea / unionArea;
}

// Ordering predicate for std::sort: true when `a` has a strictly higher
// confidence than `b`, i.e. sorts detections by descending `conf`.
bool YOLOV5Net::cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
	return b.conf < a.conf;
}

// Per-class greedy non-maximum suppression.
//
// res         : surviving detections are appended here (the vector is not cleared).
// output      : raw buffer; output[0] is read as the number of records, followed
//               by records of sizeof(Yolo::Detection)/sizeof(float) floats each.
// conf_thresh : records whose float at index 4 is <= this value are skipped.
// nms_thresh  : within one class, a detection is dropped when its IoU with an
//               already kept, higher-confidence detection exceeds this value.
//
// Classes are processed in ascending class_id order; within a class the kept
// detections are emitted in descending confidence order.
void YOLOV5Net::nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh) {
	const int det_size = sizeof(Yolo::Detection) / sizeof(float);

	// Group the candidates that pass the confidence threshold by class id.
	std::map<float, std::vector<Yolo::Detection>> per_class;
	for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
		const float *record = &output[1 + det_size * i];
		if (record[4] <= conf_thresh) continue;
		Yolo::Detection det;
		memcpy(&det, record, det_size * sizeof(float));
		// operator[] creates the per-class vector on first use.
		per_class[det.class_id].push_back(det);
	}

	for (auto& entry : per_class) {
		auto& dets = entry.second;
		std::sort(dets.begin(), dets.end(), cmp);

		// Mark suppressed entries instead of erasing them, so no elements are
		// shifted while the candidate list is being scanned.
		std::vector<char> suppressed(dets.size(), 0);
		for (size_t keep = 0; keep < dets.size(); ++keep) {
			if (suppressed[keep]) continue;
			res.push_back(dets[keep]);
			for (size_t other = keep + 1; other < dets.size(); ++other) {
				if (suppressed[other]) continue;
				if (iou(dets[keep].bbox, dets[other].bbox) > nms_thresh) {
					suppressed[other] = 1;
				}
			}
		}
	}
}

// TensorRT weight files have a simple space delimited format:
// [count], then for each blob: [name] [size] <data x size in hex>
//
// Reads every blob of `file` into a name -> Weights map. All blobs are tagged
// DataType::kFLOAT; their values are stored as the raw 32-bit words parsed from
// the hexadecimal text. Each value buffer is allocated with malloc and is not
// released here, so the caller is responsible for freeing `values` of every entry.
//
// File-open and blob-count validation are done with assert only, so they are
// inactive when NDEBUG is defined.
std::map<std::string, Weights> YOLOV5Net::loadWeights(const std::string file) {
	std::cout << "Loading weights: " << file << std::endl;
	std::map<std::string, Weights> weightMap;

	// Open weights file
	std::ifstream input(file);
	assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");

	// Read number of weight blobs
	int32_t count = 0;
	input >> count;
	assert(count > 0 && "Invalid weight map file.");

	while (count--)
	{
		Weights wt{ DataType::kFLOAT, nullptr, 0 };

		// Read name and element count of blob
		std::string name;
		uint32_t size = 0;
		input >> name >> std::dec >> size;

		// Load blob: one 32-bit word per element (sized by the element type,
		// not by the pointer type).
		uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
		for (uint32_t x = 0; x < size; ++x)
		{
			input >> std::hex >> val[x];
		}
		wt.values = val;
		wt.count = size;

		weightMap[name] = wt;
	}

	return weightMap;
}

// Adds a per-channel scale layer that applies batch normalization using the
// stored statistics `<lname>.weight`, `.bias`, `.running_mean` and `.running_var`.
//
// For every channel i:
//   scale[i] = gamma[i] / sqrt(var[i] + eps)
//   shift[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps)
//   power[i] = 1
// The three malloc'ed buffers are registered in weightMap under
// `<lname>.scale`, `<lname>.shift` and `<lname>.power`; they are not freed here.
IScaleLayer* YOLOV5Net::addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
	const float *gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
	const float *beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
	const float *mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
	const float *var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
	int len = weightMap[lname + ".running_var"].count;

	float *scval = static_cast<float*>(malloc(sizeof(float) * len));
	float *shval = static_cast<float*>(malloc(sizeof(float) * len));
	float *pval = static_cast<float*>(malloc(sizeof(float) * len));

	// Fill all three coefficient arrays in a single pass over the channels.
	for (int ch = 0; ch < len; ++ch) {
		const auto denom = sqrt(var[ch] + eps);
		scval[ch] = gamma[ch] / denom;
		shval[ch] = beta[ch] - mean[ch] * gamma[ch] / denom;
		pval[ch] = 1.0;
	}

	Weights scale{ DataType::kFLOAT, scval, len };
	Weights shift{ DataType::kFLOAT, shval, len };
	Weights power{ DataType::kFLOAT, pval, len };

	weightMap[lname + ".scale"] = scale;
	weightMap[lname + ".shift"] = shift;
	weightMap[lname + ".power"] = power;

	// Note the argument order expected by addScale: shift, scale, power.
	IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
	assert(bnLayer);
	return bnLayer;
}

// Builds convolution -> batch norm -> SiLU and returns the final layer.
//
// The convolution uses `<lname>.conv.weight` without bias, a square kernel of
// `ksize`, stride `s`, padding `ksize / 2` and `g` groups. Batch norm reads the
// `<lname>.bn` weights with eps 1e-3. SiLU is expressed as x * sigmoid(x).
ILayer* YOLOV5Net::convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
	const int pad = ksize / 2;
	Weights noBias{ DataType::kFLOAT, nullptr, 0 };

	IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], noBias);
	assert(conv);
	conv->setStrideNd(DimsHW{ s, s });
	conv->setPaddingNd(DimsHW{ pad, pad });
	conv->setNbGroups(g);

	IScaleLayer* norm = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
	ITensor* normalized = norm->getOutput(0);

	// silu = x * sigmoid
	auto sigmoid = network->addActivation(*normalized, ActivationType::kSIGMOID);
	assert(sigmoid);
	auto silu = network->addElementWise(*normalized, *sigmoid->getOutput(0), ElementWiseOperation::kPROD);
	assert(silu);
	return silu;
}

// Focus block: takes four stride-2 slices of the input (one per pixel parity in
// the last two dimensions), concatenates them, and applies a convBlock
// (`<lname>.conv`, stride 1, 1 group) on the result.
//
// Each slice has size {inch, INPUT_H / 2, INPUT_W / 2}.
ILayer* YOLOV5Net::focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) {
	const Dims3 sliceSize{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 };
	const Dims3 sliceStride{ 1, 2, 2 };

	// Adds one slice starting at the given offsets in the last two dimensions.
	auto addParitySlice = [&](int off1, int off2) -> ISliceLayer* {
		return network->addSlice(input, Dims3{ 0, off1, off2 }, sliceSize, sliceStride);
	};

	ISliceLayer* evenEven = addParitySlice(0, 0);
	ISliceLayer* oddEven = addParitySlice(1, 0);
	ISliceLayer* evenOdd = addParitySlice(0, 1);
	ISliceLayer* oddOdd = addParitySlice(1, 1);

	ITensor* pieces[] = { evenEven->getOutput(0), oddEven->getOutput(0), evenOdd->getOutput(0), oddOdd->getOutput(0) };
	auto merged = network->addConcatenation(pieces, 4);
	return convBlock(network, weightMap, *merged->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
}

// Bottleneck: 1x1 convBlock (`.cv1`, c2 * e channels) followed by a 3x3
// convBlock (`.cv2`, c2 channels, `g` groups). When `shortcut` is set and
// c1 == c2, the block input is added element-wise to the result.
ILayer* YOLOV5Net::bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
	const int hidden = static_cast<int>(static_cast<float>(c2) * e);
	auto reduce = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
	auto expand = convBlock(network, weightMap, *reduce->getOutput(0), c2, 3, 1, g, lname + ".cv2");

	const bool residual = shortcut && c1 == c2;
	if (!residual) {
		return expand;
	}
	return network->addElementWise(input, *expand->getOutput(0), ElementWiseOperation::kSUM);
}

// BottleneckCSP block with two branches over the same input:
//   main: convBlock `.cv1` -> n bottlenecks (`.m.<i>`) -> plain 1x1 conv `.cv3`
//   side: plain 1x1 conv `.cv2`
// The branches (main first) are concatenated, passed through batch norm
// (`.bn`, eps 1e-4) and leaky ReLU (alpha 0.1), then through convBlock `.cv4`
// which produces c2 channels. Each branch has c2 * e channels.
ILayer* YOLOV5Net::bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
	Weights noBias{ DataType::kFLOAT, nullptr, 0 };
	const int hidden = static_cast<int>(static_cast<float>(c2) * e);

	auto mainEntry = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
	auto sideConv = network->addConvolutionNd(input, hidden, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], noBias);

	// Chain the bottlenecks on the main branch.
	ITensor *mainOut = mainEntry->getOutput(0);
	for (int idx = 0; idx < n; ++idx) {
		mainOut = bottleneck(network, weightMap, *mainOut, hidden, hidden, shortcut, g, 1.0, lname + ".m." + std::to_string(idx))->getOutput(0);
	}
	auto mainConv = network->addConvolutionNd(*mainOut, hidden, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], noBias);

	ITensor* branches[] = { mainConv->getOutput(0), sideConv->getOutput(0) };
	auto merged = network->addConcatenation(branches, 2);

	IScaleLayer* norm = addBatchNorm2d(network, weightMap, *merged->getOutput(0), lname + ".bn", 1e-4);
	auto leaky = network->addActivation(*norm->getOutput(0), ActivationType::kLEAKY_RELU);
	leaky->setAlpha(0.1);

	return convBlock(network, weightMap, *leaky->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
}

// C3 block: two 1x1 convBlocks (`.cv1`, `.cv2`) over the same input, each with
// c2 * e channels. The `.cv1` branch is followed by n bottlenecks (`.m.<i>`).
// Both branches (bottleneck branch first) are concatenated and fed to a final
// 1x1 convBlock `.cv3` producing c2 channels.
ILayer* YOLOV5Net::C3(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
	const int hidden = static_cast<int>(static_cast<float>(c2) * e);
	auto deepEntry = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
	auto skipBranch = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv2");

	// Chain the bottlenecks on the `.cv1` branch.
	ITensor *deepOut = deepEntry->getOutput(0);
	for (int idx = 0; idx < n; ++idx) {
		deepOut = bottleneck(network, weightMap, *deepOut, hidden, hidden, shortcut, g, 1.0, lname + ".m." + std::to_string(idx))->getOutput(0);
	}

	ITensor* branches[] = { deepOut, skipBranch->getOutput(0) };
	auto merged = network->addConcatenation(branches, 2);

	return convBlock(network, weightMap, *merged->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
}

// SPP block: a 1x1 convBlock `.cv1` reducing to c1 / 2 channels, followed by
// three stride-1 max poolings with kernel sizes k1, k2, k3 (padding k / 2).
// The reduced tensor and the three pooled tensors are concatenated and passed
// through a 1x1 convBlock `.cv2` producing c2 channels.
ILayer* YOLOV5Net::SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
	const int hidden = c1 / 2;
	auto reduce = convBlock(network, weightMap, input, hidden, 1, 1, 1, lname + ".cv1");
	ITensor* reduced = reduce->getOutput(0);

	// Adds one stride-1 max pooling of kernel size k over the reduced tensor.
	auto addMaxPool = [&](int k) -> ITensor* {
		auto pool = network->addPoolingNd(*reduced, PoolingType::kMAX, DimsHW{ k, k });
		pool->setPaddingNd(DimsHW{ k / 2, k / 2 });
		pool->setStrideNd(DimsHW{ 1, 1 });
		return pool->getOutput(0);
	};
	ITensor* pooled1 = addMaxPool(k1);
	ITensor* pooled2 = addMaxPool(k2);
	ITensor* pooled3 = addMaxPool(k3);

	ITensor* stack[] = { reduced, pooled1, pooled2, pooled3 };
	auto merged = network->addConcatenation(stack, 4);

	return convBlock(network, weightMap, *merged->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
}

std::vector<std::vector<float>> YOLOV5Net::getAnchors(std::map<std::string, Weights>& weightMap, std::string lname) {
	std::vector<std::vector<float>> anchors;
	Weights wts = weightMap[lname + ".anchor_grid"];
	int anchor_len = Yolo::CHECK_COUNT * 2;
	for (int i = 0; i < wts.count / anchor_len; i++) {
		auto *p = (const float*)wts.values + i * anchor_len;
		std::vector<float> anchor(p, p + anchor_len);
		anchors.push_back(anchor);
	}
	return anchors;
}

// Creates the "YoloLayer_TRT" (version "1") plugin and adds it to the network,
// using output 0 of every layer in `dets` as plugin inputs.
//
// Two plugin fields are passed to the creator:
//   "netinfo": {CLASS_NUM, INPUT_W, INPUT_H, MAX_OUTPUT_BBOX_COUNT}
//   "kernels": one Yolo::YoloKernel per anchor group found in
//              `<lname>.anchor_grid`; the i-th kernel has size
//              INPUT_W / scale x INPUT_H / scale with scale = 8, 16, 32, ...
//
// The plugin creator and the created plugin are checked with assert, matching
// the null checks used by the other layer builders in this file.
IPluginV2Layer* YOLOV5Net::addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, std::string lname, std::vector<IConvolutionLayer*> dets) {
	auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
	assert(creator);
	auto anchors = getAnchors(weightMap, lname);

	PluginField plugin_fields[2];
	int netinfo[4] = { Yolo::CLASS_NUM, Yolo::INPUT_W, Yolo::INPUT_H, Yolo::MAX_OUTPUT_BBOX_COUNT };
	plugin_fields[0].data = netinfo;
	plugin_fields[0].length = 4;
	plugin_fields[0].name = "netinfo";
	// NOTE(review): tagged kFLOAT32 although the array holds ints; kept as in
	// the original -- presumably the plugin reads the raw values, confirm there.
	plugin_fields[0].type = PluginFieldType::kFLOAT32;

	// One kernel per anchor group; the grid scale doubles for each group.
	int scale = 8;
	std::vector<Yolo::YoloKernel> kernels;
	kernels.reserve(anchors.size());
	for (size_t i = 0; i < anchors.size(); i++) {
		Yolo::YoloKernel kernel;
		kernel.width = Yolo::INPUT_W / scale;
		kernel.height = Yolo::INPUT_H / scale;
		memcpy(kernel.anchors, anchors[i].data(), anchors[i].size() * sizeof(float));
		kernels.push_back(kernel);
		scale *= 2;
	}
	// data() is well-defined on an empty vector, unlike &kernels[0].
	plugin_fields[1].data = kernels.data();
	plugin_fields[1].length = static_cast<int>(kernels.size());
	plugin_fields[1].name = "kernels";
	plugin_fields[1].type = PluginFieldType::kFLOAT32;

	PluginFieldCollection plugin_data;
	plugin_data.nbFields = 2;
	plugin_data.fields = plugin_fields;
	IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data);
	assert(plugin_obj);

	std::vector<ITensor*> input_tensors;
	input_tensors.reserve(dets.size());
	for (IConvolutionLayer* det : dets) {
		input_tensors.push_back(det->getOutput(0));
	}
	auto yolo = network->addPluginV2(input_tensors.data(), static_cast<int>(input_tensors.size()), *plugin_obj);
	return yolo;
}

//#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
//#define DEVICE 0  // GPU id
//#define NMS_THRESH 0.45
//#define CONF_THRESH 0.1
//#define BATCH_SIZE 1

// stuff we know about the network and the input/output blobs
//static const int INPUT_H = Yolo::INPUT_H;
//static const int INPUT_W = Yolo::INPUT_W;
//static const int CLASS_NUM = Yolo::CLASS_NUM;
//static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
//const char* INPUT_BLOB_NAME = "data";
//const char* OUTPUT_BLOB_NAME = "prob";
//static Logger gLogger;

//static int get_width(int x, float gw, int divisor = 8) {
//    return int(ceil((x * gw) / divisor)) * divisor;
//}
//
//static int get_depth(int x, float gd) {
//    if (x == 1) return 1;
//    int r = round(x * gd);
//    if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) {
//        --r;
//    }
//    return std::max<int>(r, 1);
//}

//ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
//    INetworkDefinition* network = builder->createNetworkV2(0U);
//
//    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
//    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
//    assert(data);
//
//    std::map<std::string, Weights> weightMap = loadWeights(wts_name);
//
//    /* ------ yolov5 backbone------ */
//    auto focus0 = focus(network, weightMap, *data, 3, get_width(64, gw), 3, "model.0");
//    auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
//    auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
//    auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
//    auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4");
//    auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
//    auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
//    auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
//    auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, 9, 13, "model.8");
//
//    /* ------ yolov5 head ------ */
//    auto bottleneck_csp9 = C3(network, weightMap, *spp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.9");
//    auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");
//
//    auto upsample11 = network->addResize(*conv10->getOutput(0));
//    assert(upsample11);
//    upsample11->setResizeMode(ResizeMode::kNEAREST);
//    upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());
//
//    ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
//    auto cat12 = network->addConcatenation(inputTensors12, 2);
//    auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
//    auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");
//
//	auto upsample15 = network->addResize(*conv14->getOutput(0));
//	//auto upsample15 = network->addResize(*conv14->getOutput(0));
//    assert(upsample15);
//    upsample15->setResizeMode(ResizeMode::kNEAREST);
//    upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());
//
//    ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
//    auto cat16 = network->addConcatenation(inputTensors16, 2);
//
//    auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");
//
//    /* ------ detect ------ */
//    IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
//    auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
//    ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
//    auto cat19 = network->addConcatenation(inputTensors19, 2);
//    auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
//    IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
//    auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
//    ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
//    auto cat22 = network->addConcatenation(inputTensors22, 2);
//    auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
//    IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
//
//    auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2});
//    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
//    network->markOutput(*yolo->getOutput(0));
//
//    // Build engine
//    builder->setMaxBatchSize(maxBatchSize);
//    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
//#if defined(USE_FP16)
//    config->setFlag(BuilderFlag::kFP16);
//#elif defined(USE_INT8)
//    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
//    assert(builder->platformHasFastInt8());
//    config->setFlag(BuilderFlag::kINT8);
//    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
//    config->setInt8Calibrator(calibrator);
//#endif
//
//    std::cout << "Building engine, please wait for a while..." << std::endl;
//    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
//    std::cout << "Build engine successfully!" << std::endl;
//
//    // Don't need the network any more
//    network->destroy();
//
//    // Release host memory
//    for (auto& mem : weightMap)
//    {
//        free((void*)(mem.second.values));
//    }
//
//    return engine;
//}
//
//ICudaEngine* build_engine_p6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
//    INetworkDefinition* network = builder->createNetworkV2(0U);
//
//    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
//    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
//    assert(data);
//
//    //std::map<std::string, Weights> weightMap = loadWeights(wts_name);
//	std::map<std::string, Weights> weightMap = loadWeights("");
//
//    /* ------ yolov5 backbone------ */
//    auto focus0 = focus(network, weightMap, *data, 3, get_width(64, gw), 3, "model.0");
//    auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
//    auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
//    auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
//    auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4");
//    auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
//    auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
//    auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, gw), 3, 2, 1, "model.7");
//    auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
//    auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9");
//    auto spp10 = SPP(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), 3, 5, 7, "model.10");
//    auto c3_11 = C3(network, weightMap, *spp10->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.11");
//
//    /* ------ yolov5 head ------ */
//    auto conv12 = convBlock(network, weightMap, *c3_11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12");
//    auto upsample13 = network->addResize(*conv12->getOutput(0));
//    assert(upsample13);
//    upsample13->setResizeMode(ResizeMode::kNEAREST);
//    upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions());
//    ITensor* inputTensors14[] = { upsample13->getOutput(0), c3_8->getOutput(0) };
//    auto cat14 = network->addConcatenation(inputTensors14, 2);
//    auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15");
//
//    auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16");
//    auto upsample17 = network->addResize(*conv16->getOutput(0));
//    assert(upsample17);
//    upsample17->setResizeMode(ResizeMode::kNEAREST);
//    upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions());
//    ITensor* inputTensors18[] = { upsample17->getOutput(0), c3_6->getOutput(0) };
//    auto cat18 = network->addConcatenation(inputTensors18, 2);
//    auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19");
//
//    auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20");
//    auto upsample21 = network->addResize(*conv20->getOutput(0));
//    assert(upsample21);
//    upsample21->setResizeMode(ResizeMode::kNEAREST);
//    upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions());
//    ITensor* inputTensors21[] = { upsample21->getOutput(0), c3_4->getOutput(0) };
//    auto cat22 = network->addConcatenation(inputTensors21, 2);
//    auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
//
//    auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24");
//    ITensor* inputTensors25[] = { conv24->getOutput(0), conv20->getOutput(0) };
//    auto cat25 = network->addConcatenation(inputTensors25, 2);
//    auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26");
//
//    auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27");
//    ITensor* inputTensors28[] = { conv27->getOutput(0), conv16->getOutput(0) };
//    auto cat28 = network->addConcatenation(inputTensors28, 2);
//    auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29");
//
//    auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30");
//    ITensor* inputTensors31[] = { conv30->getOutput(0), conv12->getOutput(0) };
//    auto cat31 = network->addConcatenation(inputTensors31, 2);
//    auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32");
//
//    /* ------ detect ------ */
//    IConvolutionLayer* det0 = network->addConvolutionNd(*c3_23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]);
//    IConvolutionLayer* det1 = network->addConvolutionNd(*c3_26->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]);
//    IConvolutionLayer* det2 = network->addConvolutionNd(*c3_29->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]);
//    IConvolutionLayer* det3 = network->addConvolutionNd(*c3_32->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]);
//
//    auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector<IConvolutionLayer*>{det0, det1, det2, det3});
//    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
//    network->markOutput(*yolo->getOutput(0));
//
//    // Build engine
//    builder->setMaxBatchSize(maxBatchSize);
//    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
//#if defined(USE_FP16)
//    config->setFlag(BuilderFlag::kFP16);
//#elif defined(USE_INT8)
//    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
//    assert(builder->platformHasFastInt8());
//    config->setFlag(BuilderFlag::kINT8);
//    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
//    config->setInt8Calibrator(calibrator);
//#endif
//
//    std::cout << "Building engine, please wait for a while..." << std::endl;
//    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
//    std::cout << "Build engine successfully!" << std::endl;
//
//    // Don't need the network any more
//    network->destroy();
//
//    // Release host memory
//    for (auto& mem : weightMap)
//    {
//        free((void*)(mem.second.values));
//    }
//
//    return engine;
//}

//void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, bool& is_p6, float& gd, float& gw, std::string& wts_name) {
//    // Create builder
//    IBuilder* builder = createInferBuilder(gLogger);
//    IBuilderConfig* config = builder->createBuilderConfig();
//
//    // Create model to populate the network, then set the outputs and create an engine
//    ICudaEngine *engine = nullptr;
//    if (is_p6) {
//        engine = build_engine_p6(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
//    } else {
//        engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
//    }
//    assert(engine != nullptr);
//
//    // Serialize the engine
//    (*modelStream) = engine->serialize();
//
//    // Close everything down
//    engine->destroy();
//    builder->destroy();
//    config->destroy();
//}

//void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
//    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
//	double t1 = (double)cv::getTickCount();
//    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
//	t1 = ((double)cv::getTickCount() - t1) * 1000 / cv::getTickFrequency();
//	//std::cout << "t1: " << t1 << std::endl;
//
//	double t2 = (double)cv::getTickCount();
//    context.enqueue(batchSize, buffers, stream, nullptr);
//	t2 = ((double)cv::getTickCount() - t2) * 1000 / cv::getTickFrequency();
//	//std::cout << "t2: " << t2 << std::endl;
//
//	double t3 = (double)cv::getTickCount();
//    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
//	t3 = ((double)cv::getTickCount() - t3) * 1000 / cv::getTickFrequency();
//	//std::cout << "t3: " << t3 << std::endl;
//
//	double t4 = (double)cv::getTickCount();
//    cudaStreamSynchronize(stream);
//	t4 = ((double)cv::getTickCount() - t4) * 1000 / cv::getTickFrequency();
//	//std::cout << "t4: " << t4 << std::endl;
//}
//
//bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bool& is_p6, float& gd, float& gw, std::string& img_dir) {
//    if (argc < 4) return false;
//    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
//        wts = std::string(argv[2]);
//        engine = std::string(argv[3]);
//        auto net = std::string(argv[4]);
//        if (net[0] == 's') {
//            gd = 0.33;
//            gw = 0.50;
//        } else if (net[0] == 'm') {
//            gd = 0.67;
//            gw = 0.75;
//        } else if (net[0] == 'l') {
//            gd = 1.0;
//            gw = 1.0;
//        } else if (net[0] == 'x') {
//            gd = 1.33;
//            gw = 1.25;
//        } else if (net[0] == 'c' && argc == 7) {
//            gd = atof(argv[5]);
//            gw = atof(argv[6]);
//        } else {
//            return false;
//        }
//        if (net.size() == 2 && net[1] == '6') {
//            is_p6 = true;
//        }
//    } else if (std::string(argv[1]) == "-d" && argc == 4) {
//        engine = std::string(argv[2]);
//        img_dir = std::string(argv[3]);
//    } else {
//        return false;
//    }
//    return true;
//}

//int main(int argc, char** argv) {
//    cudaSetDevice(DEVICE);
//
//    std::string wts_name = "";
//    std::string engine_name = "";
//    bool is_p6 = false;
//    float gd = 0.0f, gw = 0.0f;
//    std::string img_dir;
//    if (!parse_args(argc, argv, wts_name, engine_name, is_p6, gd, gw, img_dir)) {
//        std::cerr << "arguments not right!" << std::endl;
//        std::cerr << "./yolov5 -s [.wts] [.engine] [s/m/l/x/s6/m6/l6/x6 or c/c6 gd gw]  // serialize model to plan file" << std::endl;
//        std::cerr << "./yolov5 -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
//        return -1;
//    }
//
//    // create a model using the API directly and serialize it to a stream
//    if (!wts_name.empty()) {
//        IHostMemory* modelStream{ nullptr };
//        APIToModel(BATCH_SIZE, &modelStream, is_p6, gd, gw, wts_name);
//        assert(modelStream != nullptr);
//        std::ofstream p(engine_name, std::ios::binary);
//        if (!p) {
//            std::cerr << "could not open plan output file" << std::endl;
//            return -1;
//        }
//        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
//        modelStream->destroy();
//        return 0;
//    }
//
//    // deserialize the .engine and run inference
//    std::ifstream file(engine_name, std::ios::binary);
//    if (!file.good()) {
//        std::cerr << "read " << engine_name << " error!" << std::endl;
//        return -1;
//    }
//    char *trtModelStream = nullptr;
//    size_t size = 0;
//    file.seekg(0, file.end);
//    size = file.tellg();
//    file.seekg(0, file.beg);
//    trtModelStream = new char[size];
//    assert(trtModelStream);
//    file.read(trtModelStream, size);
//    file.close();
//
//    std::vector<std::string> file_names;
//    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
//        std::cerr << "read_files_in_dir failed." << std::endl;
//        return -1;
//    }
//
//    // prepare input data ---------------------------
//    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
//    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
//    //    data[i] = 1.0;
//    static float prob[BATCH_SIZE * OUTPUT_SIZE];
//    IRuntime* runtime = createInferRuntime(gLogger);
//    assert(runtime != nullptr);
//    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
//    assert(engine != nullptr);
//    IExecutionContext* context = engine->createExecutionContext();
//    assert(context != nullptr);
//    delete[] trtModelStream;
//    assert(engine->getNbBindings() == 2);
//    void* buffers[2];
//    // In order to bind the buffers, we need to know the names of the input and output tensors.
//    // Note that binding indices are guaranteed to be less than ICudaEngine::getNbBindings().
//    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
//    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
//    assert(inputIndex == 0);
//    assert(outputIndex == 1);
//    // Create GPU buffers on device
//    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
//    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
//    // Create stream
//    cudaStream_t stream;
//    CUDA_CHECK(cudaStreamCreate(&stream));
//
//    int fcount = 0;
//    for (int f = 0; f < (int)file_names.size(); f++) {
//        fcount++;
//        if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
//        for (int b = 0; b < fcount; b++) {
//            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
//            if (img.empty()) continue;
//            cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox BGR to RGB
//            int i = 0;
//            for (int row = 0; row < INPUT_H; ++row) {
//                uchar* uc_pixel = pr_img.data + row * pr_img.step;
//                for (int col = 0; col < INPUT_W; ++col) {
//                    data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
//                    data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
//                    data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
//                    uc_pixel += 3;
//                    ++i;
//                }
//            }
//        }
//
//        // Run inference
//        auto start = std::chrono::system_clock::now();
//        doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
//        auto end = std::chrono::system_clock::now();
//        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
//        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
//        for (int b = 0; b < fcount; b++) {
//            auto& res = batch_res[b];
//            nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
//        }
//        for (int b = 0; b < fcount; b++) {
//            auto& res = batch_res[b];
//            //std::cout << res.size() << std::endl;
//            cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
//            for (size_t j = 0; j < res.size(); j++) {
//                cv::Rect r = get_rect(img, res[j].bbox);
//                cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
//				std::cout << "img_dir: " << file_names[f - fcount + 1 + b] << std::endl;
//
//				std::cout << "label: " << res[j].class_id << std::endl;
//				std::cout << "conf: " << res[j].conf << std::endl;
//
//
//				//std::cout << "lx: " << r.x << " ly: " << r.y << std::endl;
//                //cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
//            }
//            cv::imwrite("./testResImg/_" + file_names[f - fcount + 1 + b], img);
//        }
//        fcount = 0;
//    }
//
//    // Release stream and buffers
//    cudaStreamDestroy(stream);
//    CUDA_CHECK(cudaFree(buffers[inputIndex]));
//    CUDA_CHECK(cudaFree(buffers[outputIndex]));
//    // Destroy the engine
//    context->destroy();
//    engine->destroy();
//    runtime->destroy();
//
//    // Print histogram of the output distribution
//    //std::cout << "\nOutput:\n\n";
//    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
//    //{
//    //    std::cout << prob[i] << ", ";
//    //    if (i % 10 == 0) std::cout << std::endl;
//    //}
//    //std::cout << std::endl;
//
//    return 0;
//}
